Candace Savonen - CCDL for ALSF
This notebook sets up the MAF data files for comparison and does some first line analyses.
It is the first notebook in this series which addresses issue # 30 in OpenPBTA.
To run this from the command line, use:
Rscript -e "rmarkdown::render('analyses/mutect2-vs-strelka2/01-set-up.Rmd',
clean = TRUE)"
This assumes you are in the top directory of the repository.
# We need maftools - this will be added to the running Docker issue whenever it is up
if (!("maftools" %in% installed.packages())) {
devtools::install_github("PoisonAlien/maftools")
}
Get magrittr pipe
`%>%` <- dplyr::`%>%`
Path to the symlinked data obtained via bash download-data.sh.
data_dir <- file.path("..", "..", "data")
scratch_dir <- file.path("..", "..", "scratch")
Create output directories in this analysis folder.
if (!dir.exists("results")) {
dir.create("results")
}
if (!dir.exists("plots")) {
dir.create("plots")
}
Running maftools::read.maf takes a lot of computing power and time, so to avoid having to run this for both datasets everytime we want to re-run this notebook or the analyses in the other notebook, I’ve set this up to save the MAF objects as RDS files.
First let’s establish the file paths.
# File paths for the needed files for this analysis
metadata_dir <- file.path(scratch_dir, "metadata_filtered_maf_samples.tsv")
strelka2_dir <- file.path(scratch_dir, "strelka2.RDS")
mutect2_dir <- file.path(scratch_dir, "mutect2.RDS")
Will read in as an maftools object from an RDS file, unless maftools has not been run on them yet. Establish whether the files we need for this already exist before running it again.
If you trying to run the set up step in a Docker container, it will likely be out of memory killed, unless you have ~50GB you can allot to Docker.
Prep the metadata to be used as the clinicalData for maftools it it hasn’t been prepped yet.
# Get a vector of whether these exist
files_needed <- file.exists(metadata_dir, strelka2_dir, mutect2_dir)
if (all(files_needed)) {
# Read the ready-to-go files if these files exist
metadata <- metadata <- readr::read_tsv(metadata_dir)
strelka2 <- readRDS(strelka2_dir)
mutect2 <- readRDS(mutect2_dir)
} else { # If any of the needed files don't exist, rerun this process:
# Only import the sample names
strelka2_samples <- data.table::fread(file.path(
data_dir,
"pbta-snv-strelka2.vep.maf.gz"
),
select = "Tumor_Sample_Barcode",
skip = 1,
data.table = FALSE
)
mutect2_samples <- data.table::fread(file.path(
data_dir,
"pbta-snv-mutect2.vep.maf.gz"
),
select = "Tumor_Sample_Barcode",
skip = 1,
data.table = FALSE
)
# Isolate metadata to only the samples that are in the datasets
metadata <- readr::read_tsv(data_dir, "pbta-histologies.tsv") %>%
dplyr::filter(Kids_First_Biospecimen_ID %in% c(strelka2_samples, mutect2_samples)) %>%
dplyr::distinct(Kids_First_Biospecimen_ID, .keep_all = TRUE) %>%
dplyr::arrange() %>%
readr::write_tsv(file.path(scratch_dir, "metadata_filtered_maf_samples.tsv"))
# Read in original strelka file with maftools
strelka <- maftools::read.maf(file.path(data_dir, "pbta-snv-strelka2.vep.maf.gz"),
clinicalData = metadata
)
# Save to RDS so we don't have to run this again
saveRDS(strelka, strelka2_dir)
# Same for MuTect2
mutect2 <- maftools::read.maf(file.path(data_dir, "pbta-snv-mutect2.vep.maf.gz"),
clinicalData = metadata
)
saveRDS(mutect2, mutect2_dir)
}
Parsed with column specification:
cols(
.default = col_character(),
age_at_diagnosis = [32mcol_double()[39m,
molecular_subtype = [33mcol_logical()[39m
)
See spec(...) for full column specifications.
Get gene summaries and write to TSV files.
strelka2_gene_sum <- maftools::getGeneSummary(strelka2) %>%
readr::write_tsv(file.path(
"results",
"strelka2_gene_summary.tsv"
))
mutect2_gene_sum <- maftools::getGeneSummary(mutect2) %>%
readr::write_tsv(file.path(
"results",
"mutect2_gene_summary.tsv"
))
Get sample summaries and write to TSV files.
strelka2_sample_sum <- maftools::getSampleSummary(strelka2) %>%
readr::write_tsv(file.path(
"results",
"strelka2_sample_summary.tsv"
))
mutect2_sample_sum <- maftools::getSampleSummary(mutect2) %>%
readr::write_tsv(file.path(
"results",
"mutect2_sample_summary.tsv"
))
combined_gene <- mutect2_gene_sum %>%
dplyr::full_join(strelka2_gene_sum, by = "Hugo_Symbol") %>%
reshape2::melt(id = "Hugo_Symbol") %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2"
)) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread("dataset", "value")
Let’s get a correlation test on the genes overall.
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_gene$mutect2 and combined_gene$strelka2
S = 4.7233e+13, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.9568251
cor.test(combined_gene$mutect2, combined_gene$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_gene$mutect2 and combined_gene$strelka2
t = 567.61, df = 187234, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7935980 0.7969277
sample estimates:
cor
0.7952689
combined_sample <- mutect2_sample_sum %>%
dplyr::full_join(strelka2_sample_sum, by = "Tumor_Sample_Barcode") %>%
reshape2::melt(id = "Tumor_Sample_Barcode") %>%
dplyr::mutate(dataset = as.character(grepl(".x$", variable))) %>%
dplyr::mutate(dataset = dplyr::recode(dataset,
`TRUE` = "mutect2",
`FALSE` = "strelka2"
)) %>%
dplyr::mutate(variable = gsub(".x$|.y$", "", variable)) %>%
tidyr::spread("dataset", "value")
Let’s get a correlation test on the genes overall.
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: combined_sample$mutect2 and combined_sample$strelka2
S = 3.3543e+10, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.7808425
cor.test(combined_sample$mutect2, combined_sample$strelka2, method = "pearson")
Pearson's product-moment correlation
data: combined_sample$mutect2 and combined_sample$strelka2
t = 778.22, df = 9718, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9917519 0.9923801
sample estimates:
cor
0.9920722
maftools::plotTiTv(maftools::titv(strelka2))
maftools::plotTiTv(maftools::titv(mutect2))
Here we will make these new variables for both Mutect2 and Strelka2 dataset: - Calculate VAF for each - Make a mutation ID by concatenating gene name, allele, tumor ID, and start position - Summarize the biotype variable for whether or not it is a coding gene.
Let’s do this for Strelka2 first.
strelka2_vaf <- strelka2@data %>%
dplyr::mutate(
vaf = as.numeric(t_alt_count) / (as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
base_change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding"
)
) %>%
dplyr::mutate(change = dplyr::case_when(
grepl("^-", base_change) ~ "insertion",
grepl("-$", base_change) ~ "deletion",
nchar(base_change) > 3 ~ "long_change",
TRUE ~ base_change
)) %>%
dplyr::mutate(
mutation_id = paste0(
Hugo_Symbol, "_",
change, "_",
Start_Position, "_",
Tumor_Sample_Barcode
),
general_id = paste0(Hugo_Symbol, "_", Tumor_Sample_Barcode)
) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
NAs introduced by coercion
# Take a look at this df
strelka2_vaf
Now we will do the same for MuTect2.
mutect2_vaf <- mutect2@data %>%
dplyr::mutate(
vaf = as.numeric(t_alt_count) / (as.numeric(t_ref_count) +
as.numeric(t_alt_count)),
base_change = paste0(Reference_Allele, ">", Allele),
coding = dplyr::case_when(
BIOTYPE != "protein_coding" ~ "non-coding",
TRUE ~ "protein_coding"
)
) %>%
dplyr::mutate(change = dplyr::case_when(
grepl("^-", base_change) ~ "insertion",
grepl("-$", base_change) ~ "deletion",
nchar(base_change) > 3 ~ "long_change",
TRUE ~ base_change
)) %>%
dplyr::mutate(
mutation_id = paste0(
Hugo_Symbol, "_",
change, "_",
Start_Position, "_",
Tumor_Sample_Barcode
),
general_id = paste0(Hugo_Symbol, "_", Tumor_Sample_Barcode)
) %>%
dplyr::select(-which(apply(is.na(.), 2, all)))
# Take a look at this df
mutect2_vaf
Save to a TSV file.
# Merge these data.frames together
vaf_df <- strelka2_vaf %>%
dplyr::full_join(mutect2_vaf,
by = "mutation_id",
suffix = c(".strelka2", ".mutect2")
) %>%
# Make a variable that denotes which dataset it is in.
dplyr::mutate(dataset = dplyr::case_when(
is.na(Allele.mutect2) ~ "strelka2_only",
is.na(Allele.strelka2) ~ "mutect2_only",
TRUE ~ "both"
)) %>%
readr::write_tsv(file.path("results", "combined_results.tsv"))
Session Info:
sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.5
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] Biobase_2.44.0 BiocGenerics_0.30.0
loaded via a namespace (and not attached):
[1] pkgload_1.0.2 tidyr_0.8.3 splines_3.6.1 jsonlite_1.6
[5] foreach_1.4.7 assertthat_0.2.1 yaml_2.2.0 remotes_2.1.0
[9] sessioninfo_1.1.1 lattice_0.20-38 pillar_1.4.2 backports_1.1.4
[13] glue_1.3.1 digest_0.6.20 RColorBrewer_1.1-2 colorspace_1.4-1
[17] Matrix_1.2-17 htmltools_0.3.6 plyr_1.8.4 pkgconfig_2.0.2
[21] devtools_2.1.0 bibtex_0.4.2 purrr_0.3.2 xtable_1.8-4
[25] scales_1.0.0 processx_3.4.1 VennDiagram_1.6.20 tibble_2.1.3
[29] pkgmaker_0.27 styler_1.1.1.9003 ggplot2_3.2.1 usethis_1.5.1
[33] withr_2.1.2 lazyeval_0.2.2 cli_1.1.0 survival_2.44-1.1
[37] magrittr_1.5 crayon_1.3.4 memoise_1.1.0 evaluate_0.14
[41] ps_1.3.0 fs_1.3.1 doParallel_1.0.15 NMF_0.21.0
[45] pkgbuild_1.0.4 tools_3.6.1 registry_0.5-1 data.table_1.12.2
[49] prettyunits_1.0.2 hms_0.5.0 formatR_1.7 gridBase_0.4-7
[53] stringr_1.4.0 munsell_0.5.0 cluster_2.1.0 rngtools_1.4
[57] lambda.r_1.2.3 maftools_2.0.15 callr_3.3.1 compiler_3.6.1
[61] rlang_0.4.0 futile.logger_1.4.3 grid_3.6.1 iterators_1.0.12
[65] rstudioapi_0.10 colorblindr_0.1.0 base64enc_0.1-3 rmarkdown_1.14
[69] testthat_2.2.1 gtable_0.3.0 codetools_0.2-16 curl_4.0
[73] rematch2_2.1.0 reshape2_1.4.3 R6_2.4.0 knitr_1.24
[77] dplyr_0.8.3 zeallot_0.1.0 rprojroot_1.3-2 futile.options_1.0.1
[81] readr_1.3.1 desc_1.2.0 stringi_1.4.3 Rcpp_1.0.2
[85] vctrs_0.2.0 wordcloud_2.6 tidyselect_0.2.5 xfun_0.8